library(dplyr)
library(magrittr)
library(highcharter)
wdiEc <- read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiEc.csv")
wdiHe <-read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiHe.csv")
wdiEd <-read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiEd.csv")
NotCountry <- c("WLD","HIC","OED","ECS","EUU","LMY","MIC","EMU","EAS",
"NAC","UMC","EAP","LMC","LCN","ECA","LAC","MEA","ARB",
"CEB","MNA","SSF","SSA","LDC","SST","FCS","OSS","HPC",
"LIC")
filter(wdiEc,!(Country.Code %in% NotCountry)) -> wdiEc
filter(wdiEd,!(Country.Code %in% NotCountry)) -> wdiEd
filter(wdiHe,!(Country.Code %in% NotCountry)) -> wdiHeThe following Indicators are compared in the year 2013.
GDP per capita (current US$)/NY.GDP.PCAP.CD
GDP growth (annual %)/ NY.GDP.MKTP.KD.ZG
Inflation, consumer prices (annual %)/FP.CPI.TOTL.ZG
Technical cooperation grants (BoP, current US$)/BX.GRT.TECH.CD.WD
Total debt service (% of exports of goods, services and primary income)/DT.TDS.DECT.EX.ZS
Indicators = c("NY.GDP.PCAP.CD","NY.GDP.MKTP.KD.ZG","FP.CPI.TOTL.ZG","BX.GRT.TECH.CD.WD","DT.TDS.DECT.EX.ZS")
wdiEc %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->EcGrIran is Ranked 99 amongst 212 countries in the world.
EcGr %>% filter(Indicator.Code == "NY.GDP.PCAP.CD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "GDP Per Capita (US$)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 199 amongst 212 countries in the world.
EcGr %>% filter(Indicator.Code == "NY.GDP.MKTP.KD.ZG") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "GDP growth (annual %)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 2 amongst 212 countries in the world. No Surprise!
EcGr %>% filter(Indicator.Code == "FP.CPI.TOTL.ZG") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Inflation, consumer prices (annual %)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 53 amongst 212 countries in the world.
EcGr %>% filter(Indicator.Code == "BX.GRT.TECH.CD.WD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Technical cooperation grants (BoP, current US$)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 107 amongst 212 countries in the world.
EcGr %>% filter(Indicator.Code == "DT.TDS.DECT.EX.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Total debt service (% of exports of goods, services and primary income)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))We calculate the mean value over the years for each indicator of each country, creating a new dataframe. Then we’ll reshape it to calculate the PCs.
# averaging on all the years
Values <- rowMeans(wdiEc[,5:60],na.rm = TRUE)
wdiEc2<- data.frame(wdiEc$Country.Code,wdiEc$Country.Name,wdiEc$Indicator.Code,Values)
names(wdiEc2) <- c("Country.Code","Country.Name","Indicator.Code","Values")
# reshaping the dataset
require(tidyr)
wdiEc2 %>% spread(Indicator.Code,Values)->wdiEc2
Names = wdiEc2[,1]
wdiEc2[is.na(wdiEc2)]=0
row.names(wdiEc2) = wdiEc2$Country.Code
#calculating PCs
PCmat <- prcomp(wdiEc2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmat)$importance[3,], type="l",
ylab="%variance explained",
xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")We’ll be keeping 19 PCs, describing about 80% of the data’s variance.
# choosing first 19 PCs, and showing the feature vector
chosen.components = 1:19
feature.vector = PCmat$rotation[,chosen.components]
knitr:: kable(feature.vector[1:10,1:5])| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| BG.GSR.NFSV.GD.ZS | 0.0125849 | -0.0110843 | -0.0514514 | 0.0106519 | 0.0857140 |
| BM.GSR.CMCP.ZS | -0.0118322 | 0.0026685 | -0.0660256 | 0.0940998 | 0.0333804 |
| BM.GSR.FCTY.CD | -0.1196605 | -0.0013176 | 0.0263301 | -0.0003438 | 0.0367208 |
| BM.GSR.GNFS.CD | -0.1275100 | 0.0010212 | 0.0263476 | 0.0122738 | 0.0289926 |
| BM.GSR.INSF.ZS | -0.0048009 | -0.0057565 | -0.0556014 | 0.0180324 | 0.0384153 |
| BM.GSR.MRCH.CD | -0.1275197 | 0.0011001 | 0.0271734 | 0.0130584 | 0.0265560 |
| BM.GSR.NFSV.CD | -0.1256830 | 0.0005570 | 0.0257901 | 0.0106707 | 0.0391103 |
| BM.GSR.ROYL.CD | -0.1052432 | 0.0030626 | -0.0165613 | -0.0037154 | 0.0263410 |
| BM.GSR.TOTL.CD | -0.1275161 | 0.0006523 | 0.0249218 | 0.0099674 | 0.0295785 |
| BM.GSR.TRAN.ZS | 0.0061086 | -0.0026510 | 0.0036186 | 0.1374019 | -0.0278359 |
wdiEcomy = cbind(Country.Code= wdiEc2$Country.Code,Country.Name = wdiEc2$Country.Name,data.frame(PCmat$x))
# Rank according to the first and second PC
wdiEcomy[,"Ranking1"]= rank(-wdiEcomy$PC1,ties.method = "min")
wdiEcomy[,"Ranking2"]= rank(-wdiEcomy$PC2,ties.method = "min")Now let’s see how the first pc ranks the countries:
IransRank = filter(wdiEcomy,Country.Code=="IRN")$Ranking1
hchart( wdiEcomy, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC1") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)Iran is ranked 147 amongst 236 countries. the First countries are weaker (ecomony-wise) compared to the last ones. For example the first country is liberia and the last one is the United states. We can see that the first PC has saved the variance really well.
The first 20 countries according to the second PC are ranked below.
IransRank = filter(wdiEcomy,Country.Code=="IRN")$Ranking2
wdiEcomy = wdiEcomy[order(wdiEcomy$Ranking2),]
hchart( wdiEcomy[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC2") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)library(cluster)
library(fpc)
#Clustering on the first 19 pcs
clus <- kmeans(wdiEcomy[,3:22], centers=5)
clusplot(wdiEcomy[,3:22], clus$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)Let’s find out about the countires inside each cluster.
k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")
countrynames = select(wdiEc2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")
hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Members of Each Cluster regarding Economy") %>%
hc_xAxis(title = list(text = "Country Name")) %>%
hc_yAxis(title = list(text = "Cluster"))The countries similar to Iran Economical-wise are:
n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]
k5Clus %>% filter(.,Cluster==n) -> temp
knitr::kable(temp)| Country.Code | Country.Name | Cluster |
|---|---|---|
| IDN | Indonesia | 3 |
| IRN | Iran, Islamic Rep. | 3 |
| VNM | Vietnam | 3 |
we reduce the data to the first PC. I’ll define Growth as weighted sum over the indicator differences in two consecutive years, thus, the weight of an indicator becomes it’s corresponding value in PC1. Note that the weight of each indicator in PC1, somehow show’s it’s importance in economical state of a country.
Let’s see the growth factor explained above for Iran, USA and iceland.
wdiEc %>% filter(Country.Code=="IRN") -> IranEc
IranEc[is.na(IranEc)] <- 0
wdiEc %>% filter(Country.Code=="ISL") -> IceEc
IceEc[is.na(IceEc)] <- 0
wdiEc %>% filter(Country.Code=="USA") -> USEc
USEc[is.na(USEc)] <- 0
weights <- PCmat$rotation[,1]
IranEc <- (-(weights %*% as.matrix(IranEc[,6:58])) + (weights %*% as.matrix(IranEc[,5:57])))
USEc <- -(weights %*% as.matrix(USEc[,6:58])) + (weights %*% as.matrix(USEc[,5:57]))
IceEc <- -(weights %*% as.matrix(IceEc[,6:58])) + (weights %*% as.matrix(IceEc[,5:57]))
highchart() %>%
hc_xAxis(categories = 1960:2012) %>%
hc_add_series(name = "IRAN",data = t(IranEc)) %>%
hc_add_series(name = "USA",data = t(USEc)) %>%
hc_add_series(name = "Iceland",data = t(IceEc)) %>%
hc_title(text = "Economical Growth") %>%
hc_add_theme(hc_theme_economist())We can see that Iceland’s economy has been quite steady. United States has had a fall in 2008 (the housing crisis) and Iran has gotten worse overtime.
Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.
#selecting the columns
EcGrowth0 <- wdiEc[,c(1:4,54:60)]
EcGrowth0[is.na(EcGrowth0)] <- 0
numberOfCountries = length(levels(factor(EcGrowth0$Country.Code)))
#creating an empty dataset to fill it with growth values later
EcGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
EcGrowth1 = as.data.frame(EcGrowth1)
colnames(EcGrowth1) <- colnames(EcGrowth0)[c(2,5:11)]
EcGrowth1[,"Country.Code"] = levels(factor(EcGrowth0$Country.Code))
# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
temp <- filter(EcGrowth0,Country.Code==as.character(levels(factor(wdiEc$Country.Code))[i]))
EcGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}
#fitting the linear model
EcGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> EcGrowth1
EcGrowth1[is.na(EcGrowth1)] <- 0
EcGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,EcGrowth1)
summary(EcGrowthFit)##
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013,
## data = EcGrowth1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.512e+12 8.383e+10 1.041e+11 1.067e+11 2.816e+12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.048e+11 5.037e+10 -2.080 0.03867 *
## X2009 -2.332e-01 7.170e-02 -3.253 0.00131 **
## X2010 1.902e-01 9.881e-02 1.925 0.05548 .
## X2011 3.383e-02 2.902e-01 0.117 0.90730
## X2012 -1.542e+00 2.239e-01 -6.886 5.43e-11 ***
## X2013 1.536e+00 7.277e-02 21.112 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.56e+11 on 230 degrees of freedom
## Multiple R-squared: 0.9599, Adjusted R-squared: 0.959
## F-statistic: 1100 on 5 and 230 DF, p-value: < 2.2e-16
Let’s predict iran’s ranking in 2014/2015 growth factor.
# loading the test data and changing the names so that the years are similar to the model we used
testdata<- EcGrowth1[,3:7]
colnames(testdata) <- colnames(EcGrowth1)[2:6]
EcGrowth1 <- data.frame(EcGrowth1,predict(EcGrowthFit,testdata))
EcGrowth1<- EcGrowth1[order(desc(EcGrowth1$predict.EcGrowthFit..testdata.)),]
which(EcGrowth1$Country.Code=="IRN")## [1] 236
It’s the last country. What a surprise.
The following Indicators are compared in the year 2013.
Number of under-five deaths / SH.DTH.MORT
Adults (ages 15+) newly infected with HIV / SH.HIV.INCD
Health expenditure per capita (current US$) / SH.XPD.PCAP
Population ages 65 and above (% of total) / SP.POP.65UP.TO.ZS
Life expectancy at birth, total (years) / SP.DYN.LE00.IN
Indicators = c("SH.DTH.MORT","SH.HIV.INCD","SH.XPD.PCAP","SP.POP.65UP.TO.ZS","SP.DYN.LE00.IN")
wdiHe %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->HealthIran is Ranked 47 amongst 193 countries in the world.
Health %>% filter(Indicator.Code == "SH.DTH.MORT") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Number of under-five deaths") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 33 amongst 193 countries in the world.
Health %>% filter(Indicator.Code == "SH.HIV.INCD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Adults (ages 15+) newly infected with HIV") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 96 amongst 193 countries in the world.
Health %>% filter(Indicator.Code == "SH.XPD.PCAP") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Health expenditure per capita (current US$)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 123 amongst 193 countries in the world.
Health %>% filter(Indicator.Code == "SP.POP.65UP.TO.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Population ages 65 and above (% of total)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 75 amongst 193 countries in the world.
Health %>% filter(Indicator.Code =="SP.DYN.LE00.IN") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Life expectancy at birth, total (years)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Similar to the Economics part we have:
# averaging on all the years
Values <- rowMeans(wdiHe[,5:60],na.rm = TRUE)
wdiHe2<- data.frame(wdiHe$Country.Code,wdiHe$Country.Name,wdiHe$Indicator.Code,Values)
names(wdiHe2) <- c("Country.Code","Country.Name","Indicator.Code","Values")
# reshaping the dataset
require(tidyr)
wdiHe2 %>% spread(Indicator.Code,Values)->wdiHe2
Names = wdiHe2[,1]
wdiHe2[is.na(wdiHe2)]=0
row.names(wdiHe2) = wdiHe2$Country.Code
#calculating PCs
PCmatHe <- prcomp(wdiHe2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmatHe)$importance[3,], type="l",
ylab="%variance explained",
xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")We’ll be keeping 16 PCs, describing about 80% of the data’s variance.
# choosing first 16 PCs, and showing the feature vector
chosen.components = 1:16
feature.vector = PCmatHe$rotation[,chosen.components]
knitr:: kable(feature.vector[1:10,1:10])| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| SG.VAW.1549.ZS | 0.0566705 | 0.0129454 | 0.0400020 | -0.0418096 | 0.0613298 | -0.0657096 | 0.0325744 | -0.0341940 | -0.0810918 | 0.1388954 |
| SG.VAW.ARGU.ZS | 0.0957454 | 0.0024007 | 0.0678380 | 0.0742735 | 0.0947387 | 0.0148493 | 0.0051016 | 0.1135740 | -0.2454665 | -0.0034066 |
| SG.VAW.BURN.ZS | 0.0924884 | -0.0012441 | 0.0585134 | 0.0840545 | 0.0940960 | 0.0105724 | 0.0040134 | 0.1002811 | -0.2354228 | 0.0132317 |
| SG.VAW.GOES.ZS | 0.0975128 | 0.0018089 | 0.0672851 | 0.0825888 | 0.0889987 | 0.0096868 | -0.0013800 | 0.1248553 | -0.2538857 | 0.0003699 |
| SG.VAW.NEGL.ZS | 0.0988371 | 0.0080845 | 0.0756527 | 0.0715211 | 0.0750927 | 0.0078051 | 0.0029061 | 0.1366565 | -0.2463302 | 0.0207760 |
| SG.VAW.REAS.ZS | 0.1108268 | 0.0035733 | 0.0650819 | 0.0996065 | 0.0598048 | 0.0086884 | -0.0127148 | 0.0786277 | -0.1087742 | -0.0358202 |
| SG.VAW.REFU.ZS | 0.0932232 | -0.0052615 | 0.0569789 | 0.0917695 | 0.1004684 | 0.0121666 | -0.0268809 | 0.1007971 | -0.2493176 | -0.0335532 |
| SH.ANM.ALLW.ZS | 0.1078361 | 0.0822623 | -0.0796794 | 0.0110717 | -0.0171977 | -0.0024644 | -0.0390208 | -0.0206622 | -0.0762716 | 0.0170956 |
| SH.ANM.CHLD.ZS | 0.1216283 | 0.0581052 | -0.0734171 | 0.0033365 | 0.0127704 | -0.0367793 | -0.0484936 | -0.0121745 | -0.0568834 | 0.0124750 |
| SH.ANM.NPRG.ZS | 0.1073305 | 0.0883522 | -0.0787823 | 0.0096301 | -0.0096217 | -0.0307600 | -0.0253603 | -0.0183766 | -0.0722821 | 0.0105705 |
wdiHealth = cbind(Country.Code= wdiHe2$Country.Code,Country.Name = wdiHe2$Country.Name,data.frame(PCmatHe$x))
# Rank according to the first and second PC
wdiHealth[,"Ranking1"]= rank(-wdiHealth$PC1,ties.method = "min")
wdiHealth[,"Ranking2"]= rank(-wdiHealth$PC2,ties.method = "min")Now let’s see how the first pc ranks the countries:
IransRank = filter(wdiHealth,Country.Code=="IRN")$Ranking1
hchart( wdiHealth, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC1") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)Iran is ranked 109 amongst 236 countries. This is really amazing, almost all of the first countries are located in africa and are in a bad hygenic situation compared to the last ones which are well developed rich countries such as switzerland and sweden. This shows that the first PC has saved the variance really well.
The first 20 countries according to the second PC are ranked below.
IransRank = filter(wdiHealth,Country.Code=="IRN")$Ranking2
wdiHealth = wdiHealth[order(wdiHealth$Ranking2),]
hchart( wdiHealth[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC2") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)library(cluster)
library(fpc)
#Clustering on the first 16 pcs
clus <- kmeans(wdiHealth[,3:19], centers=5)
clusplot(wdiHealth[,3:22], clus$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)Let’s find out about the countires inside each cluster. This is just amazing! One of the clusters consists of only african countries!
k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")
countrynames = select(wdiHe2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")
hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Members of Each Cluster Regardin Health") %>%
hc_xAxis(title = list(text = "Country Name")) %>%
hc_yAxis(title = list(text = "Cluster"))The countries similar to Iran Health-wise are:
n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]
k5Clus %>% filter(.,Cluster==n) -> temp
knitr::kable(temp[1:10,])| Country.Code | Country.Name | Cluster |
|---|---|---|
| ALB | Albania | 4 |
| ARG | Argentina | 4 |
| ARM | Armenia | 4 |
| AZE | Azerbaijan | 4 |
| BIH | Bosnia and Herzegovina | 4 |
| BLZ | Belize | 4 |
| BOL | Bolivia | 4 |
| BRA | Brazil | 4 |
| BRB | Barbados | 4 |
| BTN | Bhutan | 4 |
we reduce the data to the first PC. Growth factor is similar to what was defined before. Let’s see the growth factor explained above for Iran, USA and iceland.
wdiHe %>% filter(Country.Code=="IRN") -> IranHe
IranHe[is.na(IranHe)] <- 0
wdiHe %>% filter(Country.Code=="ISL") -> IceHe
IceHe[is.na(IceHe)] <- 0
wdiHe %>% filter(Country.Code=="USA") -> USHe
USHe[is.na(USHe)] <- 0
weights <- PCmatHe$rotation[,1]
IranHe <- (-(weights %*% as.matrix(IranHe[,6:58])) + (weights %*% as.matrix(IranHe[,5:57])))
USHe <- -(weights %*% as.matrix(USHe[,6:58])) + (weights %*% as.matrix(USHe[,5:57]))
IceHe <- -(weights %*% as.matrix(IceHe[,6:58])) + (weights %*% as.matrix(IceHe[,5:57]))
highchart() %>%
hc_xAxis(categories = 1960:2012) %>%
hc_add_series(name = "IRAN",data = t(IranHe)) %>%
hc_add_series(name = "USA",data = t(USHe)) %>%
hc_add_series(name = "Iceland",data = t(IceHe)) %>%
hc_title(text = "Health Growth") %>%
hc_add_theme(hc_theme_economist())Again, Iceland is an steady (and beautiful country), United States Health growth oscillates and apparently Iran has been getting better in the last couple of years.
Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.
#selecting the columns
HeGrowth0 <- wdiHe[,c(1:4,54:60)]
HeGrowth0[is.na(HeGrowth0)] <- 0
numberOfCountries = length(levels(factor(HeGrowth0$Country.Code)))
#creating an empty dataset to fill it with growth values later
HeGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
HeGrowth1 = as.data.frame(HeGrowth1)
colnames(HeGrowth1) <- colnames(HeGrowth0)[c(2,5:11)]
HeGrowth1[,"Country.Code"] = levels(factor(HeGrowth0$Country.Code))
# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
temp <- filter(HeGrowth0,Country.Code==as.character(levels(factor(wdiHe$Country.Code))[i]))
HeGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}
#fitting the linear model
HeGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> HeGrowth1
HeGrowth1[is.na(HeGrowth1)] <- 0
EcGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,HeGrowth1)
summary(EcGrowthFit)##
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013,
## data = HeGrowth1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30192.0 -82.4 254.7 556.6 27717.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.586e+02 2.986e+02 -0.866 0.387
## X2009 2.200e-02 5.927e-02 0.371 0.711
## X2010 3.753e-02 8.598e-03 4.365 1.92e-05 ***
## X2011 -1.431e+00 1.199e-01 -11.940 < 2e-16 ***
## X2012 -1.699e-01 2.624e-02 -6.473 5.75e-10 ***
## X2013 1.539e+00 7.475e-02 20.593 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4411 on 230 degrees of freedom
## Multiple R-squared: 0.9938, Adjusted R-squared: 0.9937
## F-statistic: 7421 on 5 and 230 DF, p-value: < 2.2e-16
Let’s predict iran’s ranking in 2014/2015 growth factor.
# loading the test data and changing the names so that the years are similar to the model we used
testdata<- HeGrowth1[,3:7]
colnames(testdata) <- colnames(HeGrowth1)[2:6]
HeGrowth1 <- data.frame(HeGrowth1,predict(EcGrowthFit,testdata))
HeGrowth1<- HeGrowth1[order(desc(HeGrowth1$predict.EcGrowthFit..testdata.)),]
which(HeGrowth1$Country.Code=="IRN")## [1] 120
Iran is ranked 120 amongst 236 countries. The first and last countries are Bangladesh and USA!
Indicators = c("SE.ADT.1524.LT.ZS","SE.COM.DURS","SE.PRM.NENR","SE.XPD.TOTL.GD.ZS","SL.UEM.TOTL.ZS")
wdiEd %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->EducationIran is Ranked 18 amongst 193 countries in the world.
Education %>% filter(Indicator.Code == "SE.ADT.1524.LT.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Youth literacy rate, population 15-24 years, both sexes") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 159 amongst 193 countries in the world.
Education %>% filter(Indicator.Code == "SE.COM.DURS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Duration of compulsory education (years)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 15 amongst 193 countries in the world.
Education %>% filter(Indicator.Code == "SE.PRM.NENR") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Net enrolment rate, primary, both sexes (%)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 64 amongst 193 countries in the world.
Education %>% filter(Indicator.Code == "SE.XPD.TOTL.GD.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Government expenditure on education, total (% of GDP)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Iran is Ranked 35 amongst 193 countries in the world.
Education %>% filter(Indicator.Code == "SL.UEM.TOTL.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Unemployment, total (% of total labor force)") %>%
hc_legend(enabled=F) %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1)))Similar to the Economics part we have:
# averaging on all the years
Values <- rowMeans(wdiEd[,5:60],na.rm = TRUE)
wdiEd2<- data.frame(wdiEd$Country.Code,wdiEd$Country.Name,wdiEd$Indicator.Code,Values)
names(wdiEd2) <- c("Country.Code","Country.Name","Indicator.Code","Values")
# reshaping the dataset
require(tidyr)
wdiEd2 %>% spread(Indicator.Code,Values)->wdiEd2
Names = wdiEd2[,1]
wdiEd2[is.na(wdiEd2)]=0
row.names(wdiEd2) = wdiEd2$Country.Code
#calculating PCs
PCmatEd <- prcomp(wdiEd2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmatEd)$importance[3,], type="l",
ylab="%variance explained",
xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")We’ll be keeping 18 PCs, describing about 80% of the data’s variance.
# choosing first 18 PCs, and showing the feature vector
chosen.components = 1:18
feature.vector = PCmatEd$rotation[,chosen.components]
knitr:: kable(feature.vector[1:10,1:10])| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| SE.ADT.1524.LT.FE.ZS | -0.0466829 | -0.0903742 | 0.0485449 | -0.0815209 | 0.0021185 | -0.2575296 | 0.0788784 | -0.1169873 | 0.0115377 | -0.0861005 |
| SE.ADT.1524.LT.FM.ZS | -0.0361481 | -0.0938534 | -0.0412844 | -0.0320101 | -0.0366465 | -0.2703352 | 0.0379462 | -0.1119921 | 0.0006710 | -0.0990561 |
| SE.ADT.1524.LT.MA.ZS | -0.0368997 | -0.1036704 | 0.0508523 | -0.0648474 | 0.0171239 | -0.2596017 | 0.0743038 | -0.0933686 | 0.0052279 | -0.0874504 |
| SE.ADT.1524.LT.ZS | -0.0420739 | -0.0971189 | 0.0500969 | -0.0739121 | 0.0095893 | -0.2593373 | 0.0769504 | -0.1053578 | 0.0085523 | -0.0873551 |
| SE.ADT.LITR.FE.ZS | -0.0590757 | -0.0679555 | 0.0454932 | -0.0985054 | -0.0106339 | -0.2455876 | 0.0842363 | -0.1304639 | 0.0168291 | -0.0751854 |
| SE.ADT.LITR.MA.ZS | -0.0459289 | -0.0919128 | 0.0515688 | -0.0776737 | 0.0075393 | -0.2557143 | 0.0779425 | -0.1007470 | 0.0129747 | -0.0877970 |
| SE.ADT.LITR.ZS | -0.0529924 | -0.0802100 | 0.0490544 | -0.0893385 | -0.0015594 | -0.2521274 | 0.0817540 | -0.1161200 | 0.0151838 | -0.0825169 |
| SE.COM.DURS | -0.0763946 | 0.0302383 | 0.0193303 | 0.0163701 | 0.0128389 | 0.0555283 | 0.0430003 | -0.1051711 | 0.0098574 | 0.0296149 |
| SE.ENR.PRIM.FM.ZS | -0.1150408 | -0.0764697 | 0.0584873 | 0.0317977 | -0.0432859 | 0.0333286 | -0.1245975 | -0.0451965 | -0.0026453 | 0.0460988 |
| SE.ENR.PRSC.FM.ZS | -0.1183980 | -0.0684360 | 0.0530856 | 0.0133121 | -0.0464599 | 0.0437321 | -0.0841046 | -0.0412218 | 0.0006640 | 0.0794846 |
wdiEducate = cbind(Country.Code= wdiEd2$Country.Code,Country.Name = wdiEd2$Country.Name,data.frame(PCmatEd$x))
# Rank according to the first and second PC
wdiEducate[,"Ranking1"]= rank(-wdiEducate$PC1,ties.method = "min")
wdiEducate[,"Ranking2"]= rank(-wdiEducate$PC2,ties.method = "min")Now let’s see how the first pc ranks the countries:
IransRank = filter(wdiEducate,Country.Code=="IRN")$Ranking1
hchart( wdiEducate, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC1") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)Iran is ranked 150 amongst 236 countries. the First countries are rather small countries and the last ones are almost all europian. my guess is that this ranking has to do something with the school systems.
The first 20 countries according to the second PC are ranked below.
IransRank = filter(wdiEducate,Country.Code=="IRN")$Ranking2
wdiEducate = wdiEducate[order(wdiEducate$Ranking2),]
hchart( wdiEducate[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "PC2") %>%
hc_xAxis(title = list(text = "Country Name"),
plotLines = list(
list(label = list(text = "Iran"),
color = "#1f618d",
width = 2,
value = IransRank-1))) %>%
hc_legend(enabled=F)library(cluster)
library(fpc)
#Clustering on the first 18 pcs
clus <- kmeans(wdiEducate[,3:21], centers=5)
clusplot(wdiEducate[,3:22], clus$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)Let’s find out about the countires inside each cluster.
k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")
countrynames = select(wdiEd2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")
hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>%
hc_add_theme(hc_theme_economist()) %>%
hc_title(text = "Members of Each Cluster Regarding Education") %>%
hc_xAxis(title = list(text = "Country Name")) %>%
hc_yAxis(title = list(text = "Cluster"))The countries similar to Iran Education-wise are:
n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]
k5Clus %>% filter(.,Cluster==n) -> temp
knitr::kable(temp[1:10,])| Country.Code | Country.Name | Cluster |
|---|---|---|
| ABW | Aruba | 3 |
| ALB | Albania | 3 |
| ARE | United Arab Emirates | 3 |
| ARG | Argentina | 3 |
| AZE | Azerbaijan | 3 |
| BHR | Bahrain | 3 |
| BHS | Bahamas, The | 3 |
| BIH | Bosnia and Herzegovina | 3 |
| BLZ | Belize | 3 |
| BOL | Bolivia | 3 |
we reduce the data to the first PC. Growth factor is similar to what was defined before. Let’s see the growth factor explained above for Iran, USA and iceland.
wdiEd %>% filter(Country.Code=="IRN") -> IranEd
IranEd[is.na(IranEd)] <- 0
wdiEd %>% filter(Country.Code=="ISL") -> IceEd
IceEd[is.na(IceEd)] <- 0
wdiEd %>% filter(Country.Code=="USA") -> USEd
USEd[is.na(USEd)] <- 0
weights <- PCmatEd$rotation[,1]
IranEd <- (-(weights %*% as.matrix(IranEd[,6:58])) + (weights %*% as.matrix(IranEd[,5:57])))
USEd <- -(weights %*% as.matrix(USEd[,6:58])) + (weights %*% as.matrix(USEd[,5:57]))
IceEd <- -(weights %*% as.matrix(IceEd[,6:58])) + (weights %*% as.matrix(IceEd[,5:57]))
highchart() %>%
hc_xAxis(categories = 1960:2012) %>%
hc_add_series(name = "IRAN",data = t(IranEd)) %>%
hc_add_series(name = "USA",data = t(USEd)) %>%
hc_add_series(name = "Iceland",data = t(IceEd)) %>%
hc_title(text = "Education Growth") %>%
hc_add_theme(hc_theme_economist())As always, Iceland’s situation is amaizgly steady. Iran has had a drop in year 1989 which is 1368 in our calendar that can be explained by the “Imposed Wars” being started. In the last couple of years we don’t see that much progress in educational growth of these countries.
Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.
#selecting the columns
EdGrowth0 <- wdiEd[,c(1:4,54:60)]
EdGrowth0[is.na(EdGrowth0)] <- 0
numberOfCountries = length(levels(factor(EdGrowth0$Country.Code)))
#creating an empty dataset to fill it with growth values later
EdGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
EdGrowth1 = as.data.frame(EdGrowth1)
colnames(EdGrowth1) <- colnames(EdGrowth0)[c(2,5:11)]
EdGrowth1[,"Country.Code"] = levels(factor(EdGrowth0$Country.Code))
# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
temp <- filter(EdGrowth0,Country.Code==as.character(levels(factor(wdiEd$Country.Code))[i]))
EdGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}
#fitting the linear model
EdGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> EdGrowth1
EdGrowth1[is.na(EdGrowth1)] <- 0
EdGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,EdGrowth1)
summary(EdGrowthFit)##
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013,
## data = EdGrowth1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10996463 24840 100376 108600 4532026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.086e+05 7.272e+04 -1.493 0.13676
## X2009 7.236e+00 2.311e+00 3.131 0.00197 **
## X2010 -4.840e+00 2.852e+00 -1.697 0.09107 .
## X2011 3.606e+00 2.991e+00 1.206 0.22921
## X2012 -5.700e+00 2.717e+00 -2.098 0.03699 *
## X2013 -4.256e-01 7.059e-01 -0.603 0.54720
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1082000 on 230 degrees of freedom
## Multiple R-squared: 0.819, Adjusted R-squared: 0.8151
## F-statistic: 208.2 on 5 and 230 DF, p-value: < 2.2e-16
Let’s predict iran’s ranking in 2014/2015 growth factor.
# loading the test data and changing the names so that the years are similar to the model we used
testdata<- EdGrowth1[,3:7]
colnames(testdata) <- colnames(EdGrowth1)[2:6]
EdGrowth1 <- data.frame(EdGrowth1,predict(EdGrowthFit,testdata))
EdGrowth1<- EdGrowth1[order(desc(EdGrowth1$predict.EdGrowthFit..testdata.)),]
which(EdGrowth1$Country.Code=="IRN")## [1] 28
It’s the 28th Country, Great news!